# Bias, Variance, and Noise Trade-off

In [1]:
# Code source: Sebastian Curi and Andreas Krause, based on Jaques Grobler (sklearn demos).
# License: BSD 3 clause

# We start importing some modules and running some magic commands
% matplotlib inline
% reload_ext autoreload
% load_ext autoreload
% autoreload 2

# General math and plotting modules.
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

# Project files.
from util import gradient_descent, generate_polynomial_data
import plot_helpers
from regressors import LinearRegressor
from regularizers import Regularizer, L2Regularizer

# Widget and formatting modules
import ipywidgets
from ipywidgets import interact, interactive, interact_manual, fixed
import pylab
# If the figures are not nicely visualized in your browser, change the following line.
pylab.rcParams['figure.figsize'] = (10, 5)

# Machine Learning library. 
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
def true_fun(X):
    """Noise-free target function: ``cos(1.5 * pi * X)``.

    Parameters
    ----------
    X : scalar or array-like accepted by NumPy arithmetic
        Point(s) at which to evaluate the function.

    Returns
    -------
    The element-wise value of ``cos(1.5 * pi * X)``.
    """
    angular_frequency = 1.5 * np.pi
    return np.cos(angular_frequency * X)

def bias_variance_to(n_samples, degree, noise):
    """Fit a polynomial regression to noisy samples of ``true_fun`` and plot it.

    Parameters
    ----------
    n_samples : int
        Number of training points drawn uniformly from [0, 1).
    degree : int
        Degree passed to ``PolynomialFeatures``.
    noise : float
        Factor multiplying the standard-normal noise added to the targets.

    The global NumPy random state is re-seeded with 0 on every call, so the
    same arguments always produce the same figure. The plot title reports the
    mean and standard deviation of the 10-fold cross-validated MSE.
    """
    np.random.seed(0)

    # Inputs are drawn before the noise, keeping the random stream order fixed.
    X = np.sort(np.random.rand(n_samples))
    gaussian_noise = np.random.randn(n_samples) * noise
    y = true_fun(X) + gaussian_noise

    # scikit-learn expects a 2-D design matrix with one column per feature.
    X_column = X[:, np.newaxis]

    model = Pipeline([
        ("polynomial_features",
         PolynomialFeatures(degree=degree, include_bias=True)),
        ("linear_regression", LinearRegression()),
    ])
    model.fit(X_column, y)

    # Scores are negated MSE values, one per fold.
    cv_neg_mse = cross_val_score(model, X_column, y,
                                 scoring="neg_mean_squared_error", cv=10)

    grid = np.linspace(0, 1, 100)
    plt.plot(grid, model.predict(grid[:, np.newaxis]), label="Model")
    plt.plot(grid, true_fun(grid), label="True function")
    plt.scatter(X, y, edgecolor='b', s=20, label="Samples")
    plt.xlabel("x")
    plt.ylabel("y")
    plt.xlim((0, 1))
    plt.ylim((-2, 2))
    plt.legend(loc="best")

    header = "Degree {}\nMSE = {:.2e}(+/- {:.2e})".format(
        degree, -cv_neg_mse.mean(), cv_neg_mse.std())
    plt.title(header)
    plt.show()
    
# Sliders driving the fit above; `continuous_update=False` means the function
# is only re-run when a slider is released, not on every intermediate value.
interact(
    bias_variance_to,
    n_samples=ipywidgets.IntSlider(
        description='Number of samples:',
        value=30,
        min=30,
        max=300,
        step=1,
        continuous_update=False,
        style={'description_width': 'initial'},
    ),
    degree=ipywidgets.IntSlider(
        description='Polynomial Degree:',
        value=1,
        min=1,
        max=15,
        step=1,
        continuous_update=False,
        style={'description_width': 'initial'},
    ),
    noise=ipywidgets.FloatSlider(
        description='Noise level:',
        value=0.1,
        min=0,
        max=1,
        step=0.1,
        readout_format='.1f',
        continuous_update=False,
        style={'description_width': 'initial'},
    ),
);

In [3]:
# Polynomial degrees (model complexities) compared in the plot below.
degrees = np.arange(1, 10, 1)

# NOTE(review): this redefines `bias_variance_to` from the previous cell (which
# takes three arguments). The name is kept so existing calls keep working, but
# a distinct name would avoid the shadowing.
def bias_variance_to(noise):
    """Plot the cross-validated MSE of polynomial regression against its degree.

    Parameters
    ----------
    noise : float
        Factor multiplying the standard-normal noise added to ``true_fun``.

    For every degree in the module-level ``degrees`` array, a
    ``PolynomialFeatures`` + ``LinearRegression`` pipeline is scored with
    10-fold cross-validation, and the mean MSE is plotted against the degree.
    The global NumPy random state is re-seeded with 0 on every call.
    """
    n_samples = 300
    np.random.seed(0)

    # Draw the dataset once so that every degree is scored on the same
    # samples; re-sampling inside the loop would mix sampling variation into
    # the comparison across model complexities.
    X = np.sort(np.random.rand(n_samples))
    y = true_fun(X) + np.random.randn(n_samples) * noise
    X_column = X[:, np.newaxis]  # scikit-learn expects a 2-D design matrix

    score = []
    for degree in degrees:
        polynomial_features = PolynomialFeatures(degree=degree,
                                                 include_bias=True)
        linear_regression = LinearRegression()
        pipeline = Pipeline([("polynomial_features", polynomial_features),
                             ("linear_regression", linear_regression)])

        # cross_val_score fits its own clones of the pipeline on each training
        # fold, so no separate `pipeline.fit` call is needed here.
        scores = cross_val_score(pipeline, X_column, y,
                                 scoring="neg_mean_squared_error", cv=10)

        # Scores are negated MSE values; flip the sign to report the MSE.
        score.append(-scores.mean())

    plt.plot(degrees, score)
    plt.ylabel('MSE')
    plt.xlabel('Polynomial degree (Model Complexity)')
    plt.show()
    
# Single slider for the noise level; the plot is refreshed only when the
# slider is released (`continuous_update=False`).
interact(
    bias_variance_to,
    noise=ipywidgets.FloatSlider(
        description='Noise level:',
        value=0.1,
        min=0,
        max=1,
        step=0.1,
        readout_format='.1f',
        continuous_update=False,
        style={'description_width': 'initial'},
    ),
);